

/*=====================================================================
                =======   COPYRIGHT NOTICE   =======
Copyright (C) 1996, Carnegie Mellon University, Cambridge University,
Ronald Rosenfeld and Philip Clarkson.

All rights reserved.

This software is made available for research purposes only.  It may be
redistributed freely for this purpose, in full or in part, provided
that this entire copyright notice is included on any copies of this
software and applications and derivations thereof.

This software is provided on an "as is" basis, without warranty of any
kind, either expressed or implied, as to any matter including, but not
limited to warranty of fitness of purpose, or merchantability, or
results obtained from use of this software.
======================================================================*/


#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "toolkit.h"
#include "rr_libs/general.h"
#include "pc_libs/pc_general.h"

#define DEFAULT_MAX_RECORDS 3000000

typedef struct {
  char *word;
  int count;
} word_rec;

int sort_by_count(const void *rec1,const void *rec2) {

  word_rec *r1;
  word_rec *r2;
  
  r1 = (word_rec *) rec1;
  r2 = (word_rec *) rec2;

  return(r2->count-r1->count);

}		  

int sort_alpha(const void *rec1,const void *rec2) {
 
  word_rec *r1;
  word_rec *r2;
 
  char *s1;
  char *s2;
  
  r1 = (word_rec *) rec1;
  r2 = (word_rec *) rec2;
  
  s1 = r1->word;
  s2 = r2->word;

  return (strcmp(s1,s2));
 
}

/***************************
      MAIN FUNCTION
 ***************************/

void main(int argc, char *argv[]) {

  int verbosity;
  int vocab_size;
  int cutoff;
  int num_recs;
  int current_rec;
  int num_above_threshold;
  int num_to_output;
  int i;
  word_rec *records;
  char temp_word[750];
  flag gt_set;
  flag top_set;

  /* Process command line */

  report_version(&argc,argv);

  if (pc_flagarg( &argc, argv,"-help")) {
    fprintf(stderr,"wfreq2vocab : Generate a vocabulary file from a word frequency file.\n");
    fprintf(stderr,"Usage : wfreq2vocab [ -top 20000 | -gt 10]\n");
    fprintf(stderr,"                    [ -records %d ]\n",DEFAULT_MAX_RECORDS);
    fprintf(stderr,"                    [ -verbosity %d]\n",DEFAULT_VERBOSITY);
    fprintf(stderr,"                    < .wfreq > .vocab\n");
    exit(1);
  }

  cutoff = pc_intarg( &argc, argv, "-gt",-1);
  vocab_size = pc_intarg(&argc, argv, "-top",-1);
  num_recs = pc_intarg(&argc, argv, "-records",DEFAULT_MAX_RECORDS);
  verbosity = pc_intarg(&argc, argv, "-verbosity",DEFAULT_VERBOSITY);
  
  pc_report_unk_args(&argc,argv,verbosity);

  if (cutoff != -1) {
    gt_set = 1;
  }
  else {
    gt_set = 0;
    cutoff = 0;
  }

  if (vocab_size != -1) {
    top_set = 1;
  }
  else {
    top_set = 0;
    vocab_size = 0;
  }
  
  if (gt_set && top_set) {
    quit(-1,"wfreq2vocab : Error : Can't use both the -top and the -gt options.\n");
  }



  if (!gt_set && !top_set) {
    vocab_size = 20000;
  }

  if (gt_set) {
    pc_message(verbosity,2,"wfreq2vocab : Will generate a vocabulary containing all words which\n              occurred more that %d times. Reading wfreq stream from stdin...\n",cutoff);
  }
  else {
    pc_message(verbosity,2,"wfreq2vocab : Will generate a vocabulary containing the most\n              frequent %d words. Reading wfreq stream from stdin...\n",vocab_size);
  }


  records = (word_rec *) rr_malloc(sizeof(word_rec)*num_recs);

  current_rec = 0;
  num_above_threshold = 0;
  
  while (!rr_feof(stdin)) {

    if (scanf("%s %d",temp_word,&(records[current_rec].count)) != 2) {
      if (!rr_feof(stdin)) {
	quit(-1,"Error reading unigram counts from standard input.\n");
      }
    }
    else {
      records[current_rec].word = salloc(temp_word);
      if (gt_set && records[current_rec].count > cutoff) {
	num_above_threshold++;
      }
      current_rec++;
    }
  }

  /* Sort records in descending order of count */

  qsort((void*) records,(size_t) current_rec, sizeof(word_rec),sort_by_count);

  if (gt_set) {
    num_to_output = num_above_threshold;
  }
  else {
    num_to_output = vocab_size;
  }

  if (current_rec<num_to_output) {
    num_to_output = current_rec;
  }

  /* Now sort the relevant records alphabetically */

  qsort((void*) records,(size_t) num_to_output, sizeof(word_rec),sort_alpha);

  if (gt_set) {
    pc_message(verbosity,2,"Size of vocabulary = %d\n",num_to_output);
  }
  
  if (num_to_output>65535) {
    pc_message(verbosity,1,"Warning : Vocab size exceeds 65535. This will cause problems with \nother tools, since word id's are stored in 2 bytes.\n");
  }

  /* Print the vocab to stdout */
  
  printf("## Vocab generated by v2 of the CMU-Cambridge Statistcal\n");
  printf("## Language Modeling toolkit.\n");
  printf("##\n");
  printf("## Includes %d words ",num_to_output);
  printf("##\n");

  for (i=0;i<=num_to_output-1;i++) {
    printf("%s\n",records[i].word);
  }

  pc_message(verbosity,0,"wfreq2vocab : Done.\n");

  exit(0);

}  
    



